### ################################################################################
### Takes in Mark Pack's PollBase, and tidies it
### saves it in tidy_polls.rds (also plots it)
### ################################################################################

here::i_am("newR/02_tidy_polling_info.R")

library(rio)
library(tidyverse)
library(zoo)
library(here)

## Workbook holding the polls, located via the project root
infile <- here::here("data", "PollBase-Q2-2020.xlsx")

## Worksheets of that workbook to read (2 to 22 inclusive)
insheets <- seq.int(from = 2L, to = 22L)

## Turn free-text day entries into one representative day per element.
##
## Every run of digits in an element is extracted and the runs are
## averaged, so "12-14" gives 13 and "3" gives 3.
##
## Arguments:
##   str      vector of day descriptions; anything that is not character
##            is passed through as.character() first
##   default  not used by this function; kept so that existing calls which
##            supply it continue to work (callers apply their own fallback)
##
## Returns a numeric vector the same length as `str`. Elements that are
## NA or contain no digits come back as NA.
##
## Warns ("Something odd here") if any element holds more than two numbers.
parse_day_range <- function(str, default = 15){
    if (!is.character(str)) {
        str <- as.character(str)
    }

    days <- rep(NA_real_, length(str))
    present <- which(!is.na(str))
    ## Nothing to parse: avoids max() over an empty set further down
    if (length(present) == 0) {
        return(days)
    }

    ## Get all the numbers (runs of digits) from the non-missing entries
    digit_runs <- regmatches(str[present],
                             gregexpr("[0-9]+", str[present]))

    ## More than two numbers is neither "day" nor "day-day"
    if (max(lengths(digit_runs)) > 2) {
        warning("Something odd here")
    }

    ## Take their average; an entry with no digits averages to NaN
    averaged <- vapply(digit_runs,
                       function(run) mean(as.numeric(run)),
                       numeric(1),
                       USE.NAMES = FALSE)
    ## Replace those NaN results with missing
    averaged[is.nan(averaged)] <- NA

    days[present] <- averaged
    days
}

## Read one worksheet of the polling workbook and reduce it to the
## columns date, Polling, Con, Lab and LD, with shares as proportions.
##
## Arguments:
##   sheet  worksheet to read, passed to rio::import() as `which`
##   file   path of the workbook
##
## Returns a data frame holding only rows where all five columns are
## present.
##
## Stops if any parsed date falls before 1940-01-01, naming the sheet.
parse_sheet <- function(sheet, file) {
    ## Warnings raised while importing are deliberately silenced
    dat <- suppressWarnings(rio::import(file,
                                        which = sheet))

    ## Blank out years below 1900, then carry the last seen Year and
    ## Month down over rows that leave them empty
    dat <- dat %>%
        mutate(Year = ifelse(Year < 1900,
                             NA,
                             Year)) %>%
        fill(Year, Month)

    ## Create a day, either based on fieldwork (if present)
    ## or Date (if present), or the middle of the month
    if ("Fieldwork" %in% names(dat)) {
        dat <- dat %>%
            mutate(fieldwork_day = parse_day_range(Fieldwork))
    } else {
        dat$fieldwork_day <- rep(NA_real_, nrow(dat))
    }

    if ("Date" %in% names(dat)) {
        dat <- dat %>%
            mutate(publication_day = parse_day_range(Date))
    } else {
        dat$publication_day <- rep(NA_real_, nrow(dat))
    }

    dat <- dat %>%
        mutate(day = case_when(!is.na(fieldwork_day) ~ fieldwork_day,
                               !is.na(publication_day) ~ publication_day,
                               TRUE ~ 15))

    ## Create a date
    ## NOTE(review): "%b" assumes Month holds abbreviated month names in
    ## the session locale -- confirm against the workbook
    dat <- dat %>%
        mutate(date = paste(Year,
                            Month,
                            day,
                            sep = "-"),
               date = as.Date(date, format = "%Y-%b-%d"))

    ## Dates before 1940 are treated as a parsing failure
    too_early <- !is.na(dat$date) & dat$date < as.Date("1940-01-01")
    if (any(too_early)) {
        stop("Sheet ", sheet, ": ", sum(too_early),
             " row(s) parsed to a date before 1940-01-01",
             call. = FALSE)
    }

    ## Make sure we have values for the three parties
    ## Sanitize some of the names at first mention if they look funny:
    ## the first column whose name contains each party label is renamed
    if (!is.element("Con", names(dat))) {
        names(dat)[grep("Con", names(dat))[1]] <- "Con"
        names(dat)[grep("Lab", names(dat))[1]] <- "Lab"
        names(dat)[grep("LD", names(dat))[1]] <- "LD"
    }

    ## What if there's an SDP column? Add it on...
    ## (na.rm = TRUE means a row missing both LD and SDP becomes 0, not NA)
    if (is.element("SDP", names(dat))) {
        dat$LD <- rowSums(dat[,c("LD", "SDP")], na.rm = TRUE)
    }

    ## Percentages to proportions
    dat <- dat %>%
        mutate(Con = Con / 100,
               Lab = Lab / 100,
               LD = LD / 100)

    ## Keep the columns of interest and only fully populated rows
    dat <- dat %>%
        dplyr::select(date, Polling, Con, Lab, LD) %>%
        filter(complete.cases(.))

    dat
}

### Parse every requested sheet and stack the results row-wise
dat <- insheets %>%
    lapply(parse_sheet, file = infile) %>%
    bind_rows()

### Create other category: whatever the three named parties leave over
dat$Oth <- 1 - rowSums(dat[c("Con", "Lab", "LD")], na.rm = TRUE)

### Bring in GB shares of the national vote
gb_shares <- read.csv(here::here("data", "historicalvotecounts.csv"))

### Columns are renamed by position, so the file must supply them in
### exactly this order
names(gb_shares) <- c("date",
                      "Con",
                      "Lab",
                      "LD",
                      "SNP",
                      "PC",
                      "Green",
                      "UKIP",
                      "Total")

### Nationalists are SNP and Plaid Cymru combined
gb_shares$Nat <- gb_shares$SNP + gb_shares$PC

### Others are everything apart from the three main parties and the
### nationalists. PC has to be subtracted here as well as SNP, otherwise
### its votes are counted in both Nat and Oth
named_parties <- c("Con", "Lab", "LD", "SNP", "PC")
gb_shares$Oth <- gb_shares$Total -
    rowSums(gb_shares[, named_parties], na.rm = TRUE)

### Convert counts to shares of the total
gb_shares$Con <- gb_shares$Con / gb_shares$Total
gb_shares$Lab <- gb_shares$Lab / gb_shares$Total
gb_shares$LD <- gb_shares$LD / gb_shares$Total
gb_shares$Nat.hist <- gb_shares$Nat / gb_shares$Total
gb_shares$Oth <- gb_shares$Oth / gb_shares$Total

gb_shares <- gb_shares %>%
    dplyr::select(date, Con, Lab, LD, Nat.hist, Oth)

### Attach the historical shares to the polls. Poll columns keep their
### names; historical columns of the same name gain a ".hist" suffix.
### NOTE(review): dat$date is a Date while gb_shares$date comes straight
### from read.csv -- confirm the two are matched/combined as intended
dat <- merge(dat, gb_shares,
             by = "date",
             suffixes = c("", ".hist"),
             all = TRUE)

### Carry forward the missing historical values
dat <- dat %>%
    arrange(date) %>%
    fill(ends_with(".hist"), .direction = "down")

### Remove zero entries (rows whose poll shares sum to zero, which
### includes rows with no poll values at all)
nrow(dat)
dat <- dat[rowSums(dat[,c("Con", "Lab", "LD", "Oth")], na.rm = TRUE) > 0,]
nrow(dat)

### Amend Nat and Oth
### Initially allocate "Nat" the Nat share of the historic Nat + Other figure
### (both lines read the original Oth: Nat is assigned first, Oth second)
dat$Nat <- dat$Oth * (dat$Nat.hist / (dat$Nat.hist + dat$Oth.hist))
dat$Oth <- dat$Oth * (dat$Oth.hist / (dat$Nat.hist + dat$Oth.hist))

### Without historical figures the split is undefined, so Oth goes back
### to the plain remainder
no_historical_data <- which(is.na(dat$Nat.hist))
dat$Oth[no_historical_data] <- 1 - rowSums(dat[no_historical_data,
                                           c("Con", "Lab", "LD")])

### But in some circumstances it might be unrealistic (i.e., > 50% of
### Scottish + Welsh population share)
### Cap Nat at 0.06 and hand the excess back to Oth
problem_cases <- which(dat$Nat > 0.06)
surplus <- dat$Nat[problem_cases] - 0.06
dat$Nat[problem_cases] <- 0.06
dat$Oth[problem_cases] <- dat$Oth[problem_cases] + surplus

### Change in log-ratio against Con, poll versus historical.
### Con is the reference category, so ConLR is log(1) - log(1).
### These must be computed before the shares are overwritten below
dat$ConLR <- log(dat$Con / dat$Con) - log(dat$Con.hist / dat$Con.hist)
dat$LabLR <- log(dat$Lab / dat$Con) - log(dat$Lab.hist / dat$Con.hist)
dat$LibLR <- log(dat$LD / dat$Con) - log(dat$LD.hist / dat$Con.hist)
dat$NatLR <- log(dat$Nat / dat$Con) - log(dat$Nat.hist / dat$Con.hist)
dat$OthLR <- log(dat$Oth / dat$Con) - log(dat$Oth.hist / dat$Con.hist)

### Replace each share with its change from the historical share
### (Con is compared with Con.hist, like every other party)
dat$Con <- dat$Con - dat$Con.hist
dat$Lab <- dat$Lab - dat$Lab.hist
dat$Lib <- dat$LD - dat$LD.hist
dat$Nat <- dat$Nat - dat$Nat.hist
dat$Oth <- dat$Oth - dat$Oth.hist

### Remove very early dates (those with no historical figure to
### compare against)
dat <- dat %>%
    filter(!is.na(Con.hist))

### Keep the share changes and the log-ratio changes. ConLR is kept too
### so that its rolling average can be taken from the right column
dat <- dat %>%
    dplyr::select(date,
                  Con, Lab, Lib, Oth, Nat,
                  ConLR, LabLR, LibLR, OthLR, NatLR)

### Create the windowed average by expanding
### Aggregate polls on the same day
dat <- dat %>%
    group_by(date) %>%
    summarize_all(mean, na.rm = TRUE)

### One row per calendar day between the first and last poll; days
### without a poll come out of the merge as NA
holder <- data.frame(date = seq(min(dat$date),
                                max(dat$date),
                                by = 1))

dat <- merge(holder, dat,
             by = "date", all = TRUE)

### Mean over the current day and the six before it, ignoring missing
### days. The first six positions are padded with NA
trailing_week_mean <- function(x) {
    rollapply(x, width = 7,
              FUN = mean, align = "right",
              na.rm = TRUE, fill = NA)
}

dat <- dat %>%
    mutate(Con2 = trailing_week_mean(Con),
           Lab2 = trailing_week_mean(Lab),
           Lib2 = trailing_week_mean(Lib),
           Nat2 = trailing_week_mean(Nat),
           Oth2 = trailing_week_mean(Oth),
           ## Log-ratio averages come from the *LR columns
           ConLR2 = trailing_week_mean(ConLR),
           LabLR2 = trailing_week_mean(LabLR),
           LibLR2 = trailing_week_mean(LibLR),
           NatLR2 = trailing_week_mean(NatLR),
           OthLR2 = trailing_week_mean(OthLR),
           ## Days in the window that have a poll. Same-day polls were
           ## averaged into one row above, so each day counts once
           npolls = rollsum(!is.na(Con), k = 7,
                            fill = 0,
                            align = "right"))

### Carry forward missing values
### (a window with no polls averages to NaN, which is blanked first)
dat <- dat %>%
    mutate_at(vars(Con2, Lab2, Lib2, Nat2, Oth2,
                   ConLR2, LabLR2, LibLR2, NatLR2, OthLR2),
              function(x)replace(x, !is.finite(x), NA)) %>%
    fill(Con2, Lab2, Lib2, Nat2, Oth2,
         ConLR2, LabLR2, LibLR2, NatLR2, OthLR2,
         .direction = "down")

### Final column names
dat <- dat %>%
    dplyr::select(date,
                  ConPollChg = Con2,
                  LabPollChg = Lab2,
                  LibPollChg = Lib2,
                  NatPollChg = Nat2,
                  OthPollChg = Oth2,
                  ConPollChgLR = ConLR2,
                  LabPollChgLR = LabLR2,
                  LibPollChgLR = LibLR2,
                  NatPollChgLR = NatLR2,
                  OthPollChgLR = OthLR2,
                  npolls)

saveRDS(dat,
        file = here::here("working",
                          "tidy_polls.rds"))


### Long format for plotting: one row per date and party, holding the
### smoothed change in share
plot.df <- dat %>%
    dplyr::select(date,
                  ConPollChg,
                  LabPollChg,
                  LibPollChg,
                  NatPollChg,
                  OthPollChg) %>%
    pivot_longer(cols = -date,
                 names_to = "party",
                 values_to = "share") %>%
    mutate(party = dplyr::recode(party,
                                 "ConPollChg" = "Conservative",
                                 "LabPollChg" = "Labour",
                                 "LibPollChg" = "Liberal (Liberal Democrat)",
                                 "NatPollChg" = "SNP/\nPlaid Cymru",
                                 "OthPollChg" = "Combined others"),
           ## Ordered factor fixes the facet order and the colour order
           party = factor(party,
                          levels = c("Conservative",
                                     "Labour",
                                     "Liberal (Liberal Democrat)",
                                     "SNP/\nPlaid Cymru",
                                     "Combined others"),
                          ordered = TRUE))


### One colour per party, in the same order as the factor levels above
partycols <- c("#0087dc",
               "#d50000",
               "#FDBB30",
               "#3F8428",
               "#999999")


### One panel per party. The legend is switched off because the panel
### titles already name the party; "none" is used because passing
### FALSE to `guide` is deprecated in ggplot2
p1 <- ggplot(plot.df, aes(x = date, y = share,
                          colour = party)) +
    geom_line() +
    scale_colour_manual(values = partycols,
                        guide = "none") +
    scale_y_continuous("Change in vote share",
                       labels = scales::percent) +
    facet_wrap(~party) +
    theme_bw()
